/*
 *  0x.Tools xcapture.bt v0.4 - Proof-of-concept prototype for sampling 
 *                              Linux thread activity using eBPF [0x.tools]
 *
 *  Copyright 2019-2023 Tanel Poder
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *  SPDX-License-Identifier: GPL-2.0-or-later
 *
 */

// This is a PoC prototype for demonstrating feasibility of the custom, programmable
// task state object populating + sampling approach. This script is not complete and
// it probably has bugs. I have plenty of improvements in mind. It's not a finished tool or a product.
//
// To avoid the extremely slow stack address to symbol resolution in bpftrace, enable
// symbol caching, for example:
//
// sudo BPFTRACE_CACHE_USER_SYMBOLS=1 bpftrace xcapture.bt
// or
// sudo BPFTRACE_CACHE_USER_SYMBOLS=1 bpftrace -f json xcapture.bt > out.json

BEGIN {
    // Lookup table used by the probes below: key is (curtask->__state & 0xff),
    // value is the one-letter thread state code that ends up in @task_state.
    //
    // NOTE(review): the keys look like the values reported via /proc rather than the
    // raw __state bits for every entry (e.g. 0x80 / idle) - confirm against the
    // target kernel's include/linux/sched.h before trusting the less common states.

    // runnable / sleeping
    @TASK_STATES[0x00] = "R";   // running (or runnable)
    @TASK_STATES[0x01] = "S";   // interruptible sleep
    @TASK_STATES[0x02] = "D";   // uninterruptible (disk) sleep

    // stopped / traced
    @TASK_STATES[0x04] = "T";   // stopped
    @TASK_STATES[0x08] = "t";   // tracing stop

    // exiting
    @TASK_STATES[0x10] = "X";   // dead
    @TASK_STATES[0x20] = "Z";   // zombie

    // special
    @TASK_STATES[0x40] = "P";   // parked
    @TASK_STATES[0x80] = "I";   // idle
}


// record system calls by threads into the thread state array
// ideally/eventually need to move pid/uid/gid (and perhaps comm) assignment out of the syscall probe
tracepoint:raw_syscalls:sys_enter {
    // Every map below is a plain hash map keyed by thread ID. Entries are not removed
    // automatically; this script deletes them in its sched_process_exit probe.
    // Reminder: *in bpftrace* tid is the thread (task) ID, pid is the process
    // (thread group) ID.

    // --- what the thread is asking the kernel to do ---
    @syscall_id[tid]     = args->id;
    @syscall_args[tid]   = (args->args[0],
                            args->args[1],
                            args->args[2],
                            args->args[3],
                            args->args[4],
                            args->args[5]);
    @syscall_ustack[tid] = ustack();

    // --- who the thread is ---
    @pid[tid]  = pid;
    @uid[tid]  = uid;
    @gid[tid]  = gid;
    @comm[tid] = comm;

    // str() stops at the first NUL byte, so this stores only the leading string found
    // at mm->arg_start (presumably argv[0] - confirm), not the full command line.
    @cmdline[tid] = str(uptr(curtask->mm->arg_start));

    // --- scheduler state at syscall entry ---
    @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
}

tracepoint:raw_syscalls:sys_exit {
    // The thread is leaving the kernel: forget its in-flight syscall number so that
    // samples no longer report it as being inside a system call.
    // Only @syscall_id is removed here; @syscall_args and @syscall_ustack keep the
    // values recorded by the most recent sys_enter.
    // (alternative marker instead of deleting: @syscall_id[tid] = -1;)
    delete(@syscall_id[tid]);
}


// thread requests going off CPU
// by the time schedule() is called, the caller has set the new task state
kprobe:schedule {
    // Remember where the thread was (kernel and user side) when it asked to be
    // scheduled out; these stay in place until finish_task_switch removes them.
    @offcpu_kstack[tid] = kstack();
    @offcpu_ustack[tid] = ustack();

    // Refresh the state letter: at this point the caller has already stored the
    // state it is about to sleep in.
    @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
}

// thread has been put back on CPU
// newer kernels have the "isra" version of this function name, thus the * wildcard
kprobe:finish_task_switch* {
    // The current thread is running again, so its off-CPU stacks are no longer valid.
    delete(@offcpu_kstack[tid]);
    delete(@offcpu_ustack[tid]);

    // Re-read the state letter now that the thread is back on a CPU.
    @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
}

// sampled profiling of on-CPU threads
// update the stack id of threads currently running on (any) cpu
profile:hz:1 {
    // Fires once per second; records what the interrupted thread was executing.
    // The stored stacks are overwritten by the next sample of the same thread and
    // are not removed when the thread later leaves the CPU.
    @profile_kstack[tid] = kstack();
    @profile_ustack[tid] = ustack();

    @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
}

// Context enrichment example (kernel): tasks waiting in the CPU runqueue
tracepoint:sched:sched_wakeup,
tracepoint:sched:sched_wakeup_new
{
    // Flag the woken task (args->pid is the tracepoint's task ID field); the flag
    // stays set until sched_switch reports that task as the next one to run.
    @sched_wakeup[args->pid] = 1;
}

tracepoint:sched:sched_switch {
    // The task being switched in is no longer waiting to run: drop its wakeup flag.
    // (alternative marker instead of deleting: @sched_wakeup[args->next_pid] = -1;)
    delete(@sched_wakeup[args->next_pid]);
}

tracepoint:sched:sched_process_exit {
    // Thread is exiting: remove every per-thread entry this script keeps for it.
    // args->pid is the exiting task's ID, i.e. the same value the other probes use
    // as the [tid] key. Nothing else in the script removes these entries, so any
    // map missing from this list grows with every exited thread and keeps being
    // printed by the interval probe.
    //
    // NOTE(review): other probes (kprobe:schedule, finish_task_switch, profile) may
    // still fire for this thread after this tracepoint and re-create some entries -
    // confirm whether an additional, later cleanup hook is needed.

    // identity
    delete(@pid            [args->pid]);
    delete(@uid            [args->pid]);
    delete(@gid            [args->pid]);
    delete(@comm           [args->pid]);
    delete(@cmdline        [args->pid]);

    // scheduler state
    delete(@task_state     [args->pid]);
    delete(@sched_wakeup   [args->pid]);

    // syscall context
    delete(@syscall_id     [args->pid]);
    delete(@syscall_args   [args->pid]);
    delete(@syscall_ustack [args->pid]);

    // stacks (previously leaked: never deleted for exited threads)
    delete(@profile_ustack [args->pid]);
    delete(@profile_kstack [args->pid]);
    delete(@offcpu_ustack  [args->pid]);
    delete(@offcpu_kstack  [args->pid]);

    // application-level enrichment (previously leaked if the thread exited mid-wait)
    delete(@oracle_wait_event[args->pid]);
}


// Context enrichment example (application): Oracle database wait events
uprobe:/u01/app/oracle/product/19.0.0/dbhome_1/bin/oracle:kskthbwt {
    // Runs on entry to the Oracle function kskthbwt and stores a wait event name
    // for the calling thread; the matching kskthewt probe removes it again.
    // NOTE(review): presumably kskthbwt marks the *begin* of a wait and arg1 is the
    // wait event number - confirm against the Oracle version in use.

    // Read a pointer from a fixed user-space address and treat it as the start of
    // the event name array.
    // NOTE(review): 0x600069f0 and the 56-byte slot size are hard-coded and look
    // specific to one Oracle build - confirm before using with another binary.
    $EVENT_NAME_ARRAY_START=(uint64 *) *uptr(0x600069f0); // uaddr("ksledt_") gave error...
    $EVENT_NAME_SLOT_SIZE=(uint64) 56;                    // sizeof(struct)

    // The array pointer is a uint64 *, so the offset is given in 8-byte elements:
    // (slot size * arg1) / 8. This assumes bpftrace scales pointer arithmetic by the
    // pointee size. The 8-byte value found at the start of the selected slot is then
    // read as a string pointer by str().
    @oracle_wait_event[tid] = str(*uptr($EVENT_NAME_ARRAY_START + ($EVENT_NAME_SLOT_SIZE * arg1)/8));
}

uprobe:/u01/app/oracle/product/19.0.0/dbhome_1/bin/oracle:kskthewt {
    // Counterpart of the kskthbwt probe: forget the wait event name recorded for
    // this thread.
    // (alternative marker instead of deleting: @oracle_wait_event[tid] = -1;)
    delete(@oracle_wait_event[tid]);
}


// write out SAMPLES of thread states & activity
// interval is executed on 1 CPU only, so we won't emit duplicates
interval:hz:1 {
    // Once per second: emit a timestamp followed by a dump of every per-thread map.
    // The escaped quotes inside the format string make the timestamp a quoted
    // string in the output (needed for the json output mode).
    @SAMPLE_TIME = strftime("\"%Y-%m-%dT%H:%M:%S.%f\"", nsecs);
    print(@SAMPLE_TIME);

    // identity
    print(@pid);
    print(@comm);
    print(@cmdline);

    // state and syscall context
    print(@task_state);
    print(@syscall_id);
    print(@syscall_args);

    // stacks: on-CPU samples, syscall entry, off-CPU
    print(@profile_ustack);
    print(@profile_kstack);
    print(@syscall_ustack);
    print(@offcpu_ustack);
    print(@offcpu_kstack);

    // enrichment
    print(@sched_wakeup);
    print(@oracle_wait_event);
}

END {
    // Empty every map before bpftrace exits so that it does not print all of them
    // once more at shutdown.

    // bookkeeping
    clear(@SAMPLE_TIME);
    clear(@TASK_STATES);

    // identity
    clear(@pid);
    clear(@uid);
    clear(@gid);
    clear(@comm);
    clear(@cmdline);

    // state and syscall context
    clear(@task_state);
    clear(@syscall_id);
    clear(@syscall_args);

    // stacks
    clear(@profile_ustack);
    clear(@profile_kstack);
    clear(@syscall_ustack);
    clear(@offcpu_ustack);
    clear(@offcpu_kstack);

    // enrichment
    clear(@sched_wakeup);
    clear(@oracle_wait_event);
}

// TODO: 
// ---------------------------------------------------------------------------------------------
// There's *plenty* to do! If you know bcc/libbpf and are interested in helping out, ping me :-)
//
// Email: tanel@tanelpoder.com
//
// PRINTOUT NOTES:
// ----------------------------------------------------------------------------------------------
// "Kernel: 5.3 bpftrace supports C style while loops:
// bpftrace -e 'i:ms:100 { $i = 0; while ($i <= 100) { printf("%d ", $i); $i++} exit(); }'
// Loops can be short circuited by using the continue and break keywords."
// 
// Unfortunately bpftrace doesn't (yet?) support iterating through only the existing (populated)
// elements in hash maps, we don't want to loop from 1 to pid_max every time we emit output!
//
// Thus, we need to use bcc/libbpf for the sampling loops or use bpftool to dump or mount 
// the kernel ebpf maps as files and do our reading / sampling from there.
//
// Since we don't want to always emit/print every single task, but would rather have some 
// conditional logic & intelligence of what threads are interesting (a'la only print R & D states
// and some specific syscalls under S state), it's better to push this decision logic down to
// kernel. This means bcc or more likely libbpf as mentioned above.
//
// DATA STRUCTURE NOTES:
// ----------------------------------------------------------------------------------------------
// With bcc/libbpf it's likely possible to use a hashmap of structs (or hashmap of maps) for 
// storing each thread's complete state in a single thread state "array", under a single TID key.
// This should reduce any timing & "read consistency" issues when sampling/emitting records too.
//
/* vi:syntax=c */
/* vi:filetype=c */
